The libraries I will use:
library(data.table)
library(dplyr)
library(jpeg)
library(imager)
library(factoextra)
I read the data and apply PCA:
# Read the Musk1 data set; read.csv() already returns a data.frame,
# so the extra %>% as.data.frame() coercion was redundant.
raw_data <- read.csv("/Users/ayberkakgun/Desktop/Musk1.csv")
# Columns 3:168 hold the 166 molecular features; standardize them
# (zero mean, unit variance) before PCA.
features <- scale(raw_data[3:168])
# The first two columns are the bag class and bag id labels.
labels <- raw_data[1:2]
colnames(labels) <- c("BagClass", "BagId")
# PCA on the standardized feature matrix.
pca <- princomp(features)
str(pca)
## List of 7
## $ sdev : Named num [1:166] 7.19 4.8 3.55 2.9 2.86 ...
## ..- attr(*, "names")= chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ loadings: loadings [1:166, 1:166] -0.036844 -0.06835 -0.094764 0.092619 -0.000908 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:166] "X42" "X.198" "X.109" "X.75" ...
## .. ..$ : chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ center : Named num [1:166] 1.60e-16 -6.24e-17 3.54e-17 1.66e-17 -6.13e-17 ...
## ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
## $ scale : Named num [1:166] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
## $ n.obs : int 475
## $ scores : num [1:475, 1:166] -0.787 -0.255 -1.359 -1.464 -1.439 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
## $ call : language princomp(x = features)
## - attr(*, "class")= chr "princomp"
# Eigenvalue table (variance and cumulative variance per component).
eigenvalue <- get_eigenvalue(pca)
head(eigenvalue, n = 15)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 51.747369 31.238880 31.23888
## Dim.2 23.049909 13.914782 45.15366
## Dim.3 12.628826 7.623777 52.77744
## Dim.4 8.400432 5.071178 57.84862
## Dim.5 8.164036 4.928469 62.77709
## Dim.6 6.755840 4.078369 66.85545
## Dim.7 5.364907 3.238690 70.09414
## Dim.8 5.040095 3.042607 73.13675
## Dim.9 3.302223 1.993488 75.13024
## Dim.10 2.849416 1.720137 76.85038
## Dim.11 2.570613 1.551829 78.40221
## Dim.12 2.375583 1.434093 79.83630
## Dim.13 2.205900 1.331659 81.16796
## Dim.14 2.107007 1.271959 82.43992
## Dim.15 1.812080 1.093917 83.53383
#a It takes 12 components to explain 80% of the variability and the
#  cumulative contribution of each additional component gets smaller.
eig <- eigenvalue$variance.percent[eigenvalue$cumulative.variance.percent < 85]
l <- length(eig)
# Scree plot. NOTE: col = "red" must be set OUTSIDE aes(); placing it
# inside aes() maps the literal string as data, which yields a default-
# coloured line plus a spurious legend entry labelled "red".
ggplot() +
  geom_col(aes(x = seq_len(l), y = eig)) +
  geom_line(aes(x = seq_len(l), y = eig), col = "red")
If we check the plot of the first two components, we see that two dimensions are not enough to distinguish the bag labels in space:
# Scores on the first two principal components, coloured by bag class
# (raw_data$X1 + 1 maps the 0/1 label to colours 1/2 = black/red).
plot(x=pca$scores[,1],y=pca$scores[,2],col=raw_data$X1+1)
# Dotted reference lines through the origin.
abline(h=0,v=0,lty=3)
I calculate the distance matrix and perform MDS on it, without forgetting to scale the data first. This plot confirms, using another method, that two components are not enough to explain the variability or to distinguish the bag labels.
# Euclidean distance matrix on the standardized features.
# upper = TRUE (never the reassignable alias T).
distanceMat <- dist(features, upper = TRUE) %>% as.matrix()
# Classical (metric) MDS down to two dimensions.
mds <- cmdscale(distanceMat)
plot(mds[, 1], mds[, 2],
     xlab = 'Component 1', ylab = 'Component 2',
     col = raw_data$X1 + 1)
#b PCA again, this time on the bag-aggregated data; now 6 components are
#  enough to explain 80% of the variability, so the points are spread
#  more heterogeneously.
# Average every column within each bag (bag id is in X1.1).
data_agg <- aggregate(raw_data, list(raw_data$X1.1), mean)
# Standardize the 166 feature columns (now shifted by the Group.1 column).
features_agg <- scale(data_agg[4:169])
pca_agg <- prcomp(features_agg)
str(pca_agg)
## List of 5
## $ sdev : num [1:92] 6.65 5.62 4.69 4.21 3.1 ...
## $ rotation: num [1:166, 1:92] -0.0536 -0.0653 -0.0607 0.0848 -0.0135 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:166] "X42" "X.198" "X.109" "X.75" ...
## .. ..$ : chr [1:92] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:166] -1.35e-16 -6.12e-17 -8.78e-17 -2.73e-17 -4.36e-17 ...
## ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
## $ scale : Named num [1:166] 18.1 76.8 57.4 66.2 20.5 ...
## ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
## $ x : num [1:92, 1:92] -1.63 -3.51 -4.39 -3.38 -5.25 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:92] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
# Eigenvalue table for the aggregated-data PCA.
eigenvalue_agg <- get_eigenvalue(pca_agg)
head(eigenvalue_agg, n = 10)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 44.241806 26.651690 26.65169
## Dim.2 31.636561 19.058170 45.70986
## Dim.3 21.961633 13.229899 58.93976
## Dim.4 17.709779 10.668542 69.60830
## Dim.5 9.609901 5.789097 75.39740
## Dim.6 7.555042 4.551230 79.94863
## Dim.7 5.042804 3.037834 82.98646
## Dim.8 3.553406 2.140606 85.12707
## Dim.9 3.268410 1.968922 87.09599
## Dim.10 2.567582 1.546736 88.64273
# First two principal-component scores of the aggregated data,
# coloured by bag class.
agg_scores <- pca_agg$x
plot(agg_scores[, 1], agg_scores[, 2],
     xlab = "Component 1",
     ylab = "Component 2",
     col = data_agg$X1 + 1)
Performing the same MDS operation on the aggregated data results in a plot similar to the PCA one. Most of the red points stay within a triangular region of the space.
# Distance matrix on the aggregated features (upper = TRUE, not the
# reassignable alias T), then classical MDS to two dimensions.
distanceMat_agg <- dist(features_agg, upper = TRUE) %>% as.matrix()
mds_agg <- cmdscale(distanceMat_agg)
plot(mds_agg[, 1], mds_agg[, 2],
     xlab = '', ylab = '',
     col = data_agg$X1 + 1)
#Task 2
I read my image, and plot it.
# Load the JPEG and draw it on an empty 256 x 256 canvas.
img <- readJPEG("/Users/ayberkakgun/Desktop/profil2.jpg")
canvas <- c(0, 256)
plot(canvas, canvas, type = "n", xlab = "", ylab = "")
rasterImage(img, 0, 0, 256, 256)
I split the channels and plot them separately.
# Extract the three 256 x 256 colour channels.
r <- img[1:256, 1:256, 1]
g <- img[1:256, 1:256, 2]
b <- img[1:256, 1:256, 3]
# Palettes ramping from black to each primary colour.
red_palette   <- colorRampPalette(c("black", "red"))
blue_palette  <- colorRampPalette(c("black", "blue"))
green_palette <- colorRampPalette(c("black", "green"))
# Render each channel with its palette; axes = FALSE (never the
# reassignable alias F).
image(r, col = red_palette(256), main = "red", axes = FALSE)
image(g, col = green_palette(256), main = "green", axes = FALSE)
image(b, col = blue_palette(256), main = "blue", axes = FALSE)
I add uniform random noise to my image, clip the values so they do not exceed 1, and display the noisy image.
# Uniform noise in [min(img), 0.1 * max(img)]. The noise matrix size is
# derived from the channel itself rather than the magic constant 65536.
minx <- min(img)
maxx <- max(img) * 0.1
noise <- matrix(runif(length(r), min = minx, max = maxx), nrow = nrow(r))
# Add the noise and clamp every channel to at most 1 with pmin()
# (vectorized; also avoids `=` used as an assignment operator).
r2 <- pmin(r + noise, 1)
g2 <- pmin(g + noise, 1)
b2 <- pmin(b + noise, 1)
# Reassemble the noisy image from the clamped channels.
img2 <- img
img2[, , 1] <- r2
img2[, , 2] <- g2
img2[, , 3] <- b2
plot(c(0, 256), c(0, 256), type = "n", xlab = "", ylab = "")
rasterImage(img2, 0, 0, 256, 256)
I construct the patches and perform PCA on them.
# Re-read the image with imager and convert it to grayscale.
imgr <- load.image("/Users/ayberkakgun/Desktop/profil2.jpg")
imgrr <- grayscale(imgr)
# Slide a patch_size x patch_size window over the 256 x 256 image with
# stride 1; each patch becomes one row of `vectors`. Sizes are derived
# from patch_size instead of being hard-coded (25^2 = 625 columns,
# (256 - 25 + 1)^2 = 232^2 = 53824 rows).
patch_size <- 25
n_pos <- 256 - patch_size + 1
vectors <- matrix(nrow = n_pos^2, ncol = patch_size^2)
k <- 1
for (i in seq_len(n_pos)) {
  for (j in seq_len(n_pos)) {
    vectors[k, ] <- as.vector(imgrr[i:(i + patch_size - 1),
                                    j:(j + patch_size - 1)])
    k <- k + 1
  }
}
# PCA on the patch matrix.
pca_img <- princomp(vectors)
eigenvalue_img <- get_eigenvalue(pca_img)
head(eigenvalue_img, 5)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 31.1588597 68.497106 68.49711
## Dim.2 4.3178490 9.492008 77.98911
## Dim.3 1.8995843 4.175892 82.16501
## Dim.4 0.9781981 2.150391 84.31540
## Dim.5 0.6798462 1.494519 85.80992
plot(eigenvalue_img$cumulative.variance.percent)
To plot the images, I first scale the scores between 0 and 1 and then construct images from the first, second and third components. They give a reasonably good indication of the image, degrading from the first component to the third, as expected.
# Min-max scale a numeric vector to [0, 1]; factors out the repeated
# (x - min) / (max - min) expression used below.
rescale01 <- function(x) (x - min(x)) / (max(x) - min(x))
# Scores of the first three components, scaled for display as gray levels.
score1_scaled <- rescale01(pca_img$scores[, 1])
score2_scaled <- rescale01(pca_img$scores[, 2])
score3_scaled <- rescale01(pca_img$scores[, 3])
# One score per patch position -> 232 x 232 images; byrow = TRUE matches
# the i-outer / j-inner ordering used when the patches were built.
imgp1 <- matrix(score1_scaled, 232, byrow = TRUE)
imgp2 <- matrix(score2_scaled, 232, byrow = TRUE)
imgp3 <- matrix(score3_scaled, 232, byrow = TRUE)
# axes = FALSE (never the reassignable alias F).
image(imgp1, col = gray.colors(256), main = "", axes = FALSE)
image(imgp2, col = gray.colors(256), main = "", axes = FALSE)
image(imgp3, col = gray.colors(256), main = "", axes = FALSE)
The same procedure for the eigenvectors (loadings), which naturally gives the worst representation of the image; it only hints at where the object in the image is concentrated.
# Min-max scale a numeric vector to [0, 1] (same helper pattern as for
# the scores, so the triplicated expression is written once).
rescale01 <- function(x) (x - min(x)) / (max(x) - min(x))
# First three loadings (eigenvectors); each has 625 entries,
# i.e. one 25 x 25 patch.
eig1_scaled <- rescale01(pca_img$loadings[, 1])
eig2_scaled <- rescale01(pca_img$loadings[, 2])
eig3_scaled <- rescale01(pca_img$loadings[, 3])
imge1 <- matrix(eig1_scaled, 25, byrow = TRUE)
imge2 <- matrix(eig2_scaled, 25, byrow = TRUE)
imge3 <- matrix(eig3_scaled, 25, byrow = TRUE)
# axes = FALSE (never the reassignable alias F).
image(imge1, col = gray.colors(256), main = "", axes = FALSE)
image(imge2, col = gray.colors(256), main = "", axes = FALSE)
image(imge3, col = gray.colors(256), main = "", axes = FALSE)